{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "a39fe162",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "使用的公司列: 区\n",
      "使用的薪资列: 单价(元/平米)\n",
      "总行数: 1061, 删除缺失后的行数: 1061, 删除行数: 0\n",
      "注意：金额单位为人民币（元），已按月标准化。若原为年薪则已除以12。\n",
      "示例（前50 行）：\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>company</th>\n",
       "      <th>min_salary</th>\n",
       "      <th>max_salary</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>锦江</td>\n",
       "      <td>176036.0</td>\n",
       "      <td>176036.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>锦江</td>\n",
       "      <td>26959.4</td>\n",
       "      <td>26959.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>锦江</td>\n",
       "      <td>22612.8</td>\n",
       "      <td>22612.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>.</td>\n",
       "      <td>18014.5</td>\n",
       "      <td>18014.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>锦江</td>\n",
       "      <td>13513.5</td>\n",
       "      <td>13513.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>锦江</td>\n",
       "      <td>13220.3</td>\n",
       "      <td>13220.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>锦江</td>\n",
       "      <td>43299.0</td>\n",
       "      <td>43299.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>锦江</td>\n",
       "      <td>17000.0</td>\n",
       "      <td>17000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>锦江</td>\n",
       "      <td>17785.5</td>\n",
       "      <td>17785.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>锦江</td>\n",
       "      <td>11199.3</td>\n",
       "      <td>11199.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>锦江</td>\n",
       "      <td>12546.2</td>\n",
       "      <td>12546.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>锦江</td>\n",
       "      <td>39690.4</td>\n",
       "      <td>39690.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>锦江</td>\n",
       "      <td>27712.8</td>\n",
       "      <td>27712.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>锦江</td>\n",
       "      <td>12559.8</td>\n",
       "      <td>12559.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>锦江</td>\n",
       "      <td>18701.7</td>\n",
       "      <td>18701.7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>锦江</td>\n",
       "      <td>13636.9</td>\n",
       "      <td>13636.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>锦江</td>\n",
       "      <td>15625.0</td>\n",
       "      <td>15625.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>锦江</td>\n",
       "      <td>17020.0</td>\n",
       "      <td>17020.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>锦江</td>\n",
       "      <td>17251.5</td>\n",
       "      <td>17251.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>锦江</td>\n",
       "      <td>11192.7</td>\n",
       "      <td>11192.7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>锦江</td>\n",
       "      <td>17471.3</td>\n",
       "      <td>17471.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>锦江</td>\n",
       "      <td>31179.1</td>\n",
       "      <td>31179.1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>锦江</td>\n",
       "      <td>34099.3</td>\n",
       "      <td>34099.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>锦江</td>\n",
       "      <td>12383.3</td>\n",
       "      <td>12383.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>锦江</td>\n",
       "      <td>25800.2</td>\n",
       "      <td>25800.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>锦江</td>\n",
       "      <td>16982.2</td>\n",
       "      <td>16982.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>锦江</td>\n",
       "      <td>19767.4</td>\n",
       "      <td>19767.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>锦江</td>\n",
       "      <td>25581.7</td>\n",
       "      <td>25581.7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>锦江</td>\n",
       "      <td>20197.8</td>\n",
       "      <td>20197.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>锦江</td>\n",
       "      <td>15447.2</td>\n",
       "      <td>15447.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>锦江</td>\n",
       "      <td>13832.3</td>\n",
       "      <td>13832.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31</th>\n",
       "      <td>锦江</td>\n",
       "      <td>19890.9</td>\n",
       "      <td>19890.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>锦江</td>\n",
       "      <td>31746.0</td>\n",
       "      <td>31746.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33</th>\n",
       "      <td>锦江</td>\n",
       "      <td>14970.1</td>\n",
       "      <td>14970.1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34</th>\n",
       "      <td>锦江</td>\n",
       "      <td>15727.6</td>\n",
       "      <td>15727.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>35</th>\n",
       "      <td>锦江</td>\n",
       "      <td>21120.1</td>\n",
       "      <td>21120.1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36</th>\n",
       "      <td>锦江</td>\n",
       "      <td>16444.4</td>\n",
       "      <td>16444.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>37</th>\n",
       "      <td>锦江</td>\n",
       "      <td>11096.3</td>\n",
       "      <td>11096.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38</th>\n",
       "      <td>锦江</td>\n",
       "      <td>32608.7</td>\n",
       "      <td>32608.7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>39</th>\n",
       "      <td>锦江</td>\n",
       "      <td>13959.9</td>\n",
       "      <td>13959.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>40</th>\n",
       "      <td>锦江</td>\n",
       "      <td>17448.4</td>\n",
       "      <td>17448.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41</th>\n",
       "      <td>锦江</td>\n",
       "      <td>19895.8</td>\n",
       "      <td>19895.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42</th>\n",
       "      <td>锦江</td>\n",
       "      <td>24042.6</td>\n",
       "      <td>24042.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>43</th>\n",
       "      <td>锦江</td>\n",
       "      <td>11552.7</td>\n",
       "      <td>11552.7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>44</th>\n",
       "      <td>锦江</td>\n",
       "      <td>19041.1</td>\n",
       "      <td>19041.1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>45</th>\n",
       "      <td>锦江</td>\n",
       "      <td>25239.8</td>\n",
       "      <td>25239.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>46</th>\n",
       "      <td>锦江</td>\n",
       "      <td>19357.0</td>\n",
       "      <td>19357.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>47</th>\n",
       "      <td>锦江</td>\n",
       "      <td>12921.9</td>\n",
       "      <td>12921.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48</th>\n",
       "      <td>锦江</td>\n",
       "      <td>12772.3</td>\n",
       "      <td>12772.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49</th>\n",
       "      <td>锦江</td>\n",
       "      <td>15031.1</td>\n",
       "      <td>15031.1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   company  min_salary  max_salary\n",
       "0       锦江    176036.0    176036.0\n",
       "1       锦江     26959.4     26959.4\n",
       "2       锦江     22612.8     22612.8\n",
       "3        .     18014.5     18014.5\n",
       "4       锦江     13513.5     13513.5\n",
       "5       锦江     13220.3     13220.3\n",
       "6       锦江     43299.0     43299.0\n",
       "7       锦江     17000.0     17000.0\n",
       "8       锦江     17785.5     17785.5\n",
       "9       锦江     11199.3     11199.3\n",
       "10      锦江     12546.2     12546.2\n",
       "11      锦江     39690.4     39690.4\n",
       "12      锦江     27712.8     27712.8\n",
       "13      锦江     12559.8     12559.8\n",
       "14      锦江     18701.7     18701.7\n",
       "15      锦江     13636.9     13636.9\n",
       "16      锦江     15625.0     15625.0\n",
       "17      锦江     17020.0     17020.0\n",
       "18      锦江     17251.5     17251.5\n",
       "19      锦江     11192.7     11192.7\n",
       "20      锦江     17471.3     17471.3\n",
       "21      锦江     31179.1     31179.1\n",
       "22      锦江     34099.3     34099.3\n",
       "23      锦江     12383.3     12383.3\n",
       "24      锦江     25800.2     25800.2\n",
       "25      锦江     16982.2     16982.2\n",
       "26      锦江     19767.4     19767.4\n",
       "27      锦江     25581.7     25581.7\n",
       "28      锦江     20197.8     20197.8\n",
       "29      锦江     15447.2     15447.2\n",
       "30      锦江     13832.3     13832.3\n",
       "31      锦江     19890.9     19890.9\n",
       "32      锦江     31746.0     31746.0\n",
       "33      锦江     14970.1     14970.1\n",
       "34      锦江     15727.6     15727.6\n",
       "35      锦江     21120.1     21120.1\n",
       "36      锦江     16444.4     16444.4\n",
       "37      锦江     11096.3     11096.3\n",
       "38      锦江     32608.7     32608.7\n",
       "39      锦江     13959.9     13959.9\n",
       "40      锦江     17448.4     17448.4\n",
       "41      锦江     19895.8     19895.8\n",
       "42      锦江     24042.6     24042.6\n",
       "43      锦江     11552.7     11552.7\n",
       "44      锦江     19041.1     19041.1\n",
       "45      锦江     25239.8     25239.8\n",
       "46      锦江     19357.0     19357.0\n",
       "47      锦江     12921.9     12921.9\n",
       "48      锦江     12772.3     12772.3\n",
       "49      锦江     15031.1     15031.1"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# 读取、清洗并解析薪资（执行此单元）\n",
    "import pandas as pd\n",
    "import re\n",
    "import numpy as np\n",
    "from pathlib import Path\n",
    "\n",
    "# 文件路径（相对当前工作目录）\n",
    "file = Path('handroom.xlsx')\n",
    "if not file.exists():\n",
    "    raise FileNotFoundError(f\"{file} not found in working directory: {Path.cwd()}\")\n",
    "\n",
    "# 读取所有 sheet 并合并\n",
    "sheets = pd.read_excel(file, sheet_name=None)\n",
    "df = pd.concat(sheets.values(), ignore_index=True, sort=False)\n",
    "\n",
    "# 自动识别公司列（优先匹配常见中文/英文字段）\n",
    "company_col = None\n",
    "company_keys = ['公司','企业','公司名称','招聘单位','招聘公司','company','employer','employer_name','company_name','recruiter']\n",
    "for c in df.columns:\n",
    "    cname = str(c).lower()\n",
    "    if any(k in cname for k in company_keys):\n",
    "        company_col = c\n",
    "        break\n",
    "if company_col is None:\n",
    "    # 回退：选择第一个字符串（object）类型的列作为公司列（保守策略）\n",
    "    for c in df.columns:\n",
    "        if df[c].dtype == object:\n",
    "            company_col = c\n",
    "            break\n",
    "if company_col is None:\n",
    "    raise ValueError('无法识别公司列，请检查表头名')\n",
    "print('使用的公司列:', company_col)\n",
    "\n",
    "# 自动识别薪资列\n",
    "salary_col = None\n",
    "salary_keys = ['薪','工资','薪资','money','salary','pay']\n",
    "for c in df.columns:\n",
    "    cname = str(c).lower()\n",
    "    if any(k in cname for k in salary_keys):\n",
    "        salary_col = c\n",
    "        break\n",
    "\n",
    "if salary_col is None:\n",
    "    # 如果没有明显匹配，尝试查找含有数字/字符组合的列名（如包含 10000 等）\n",
    "    for c in df.columns:\n",
    "        if any(ch.isdigit() for ch in str(c)):\n",
    "            salary_col = c\n",
    "            break\n",
    "\n",
    "if salary_col is None:\n",
    "    # 回退：优先选择数值类型的列（排除公司列），这是对\"单价(元/平米)\"等情况的通用处理\n",
    "    for c in df.columns:\n",
    "        if c == company_col:\n",
    "            continue\n",
    "        if pd.api.types.is_numeric_dtype(df[c]):\n",
    "            salary_col = c\n",
    "            break\n",
    "\n",
    "if salary_col is None:\n",
    "    # 最后再尝试通过列名中包含常见货币/价格关键词来匹配\n",
    "    for c in df.columns:\n",
    "        cname = str(c).lower()\n",
    "        if any(k in cname for k in ['价','单价','元','price']):\n",
    "            salary_col = c\n",
    "            break\n",
    "\n",
    "if salary_col is None:\n",
    "    raise ValueError('无法识别薪资列，请检查表头名')\n",
    "print('使用的薪资列:', salary_col)\n",
    "\n",
    "# 标准化公司名与薪资字段为字符串，替换空字符串为 NaN\n",
    "df[company_col] = df[company_col].astype(str).str.strip().replace({'nan': None})\n",
    "df[salary_col] = df[salary_col].astype(str).str.strip().replace({'nan': None})\n",
    "\n",
    "# 统计并删除缺失值（公司或薪资缺失则删除）\n",
    "before_count = len(df)\n",
    "df_clean = df.dropna(subset=[company_col, salary_col])\n",
    "after_count = len(df_clean)\n",
    "print(f'总行数: {before_count}, 删除缺失后的行数: {after_count}, 删除行数: {before_count - after_count}')\n",
    "\n",
    "# 解析薪资字符串的函数：返回 (min, max)（人民币，按月）\n",
    "def parse_salary(s):\n",
    "    if not isinstance(s, str) or s.strip() == '':\n",
    "        return (np.nan, np.nan)\n",
    "    s0 = s.lower()\n",
    "    # 一些直接表示不可用或面议的词\n",
    "    if any(term in s0 for term in ['面议','negoti','待议','不限']):\n",
    "        return (np.nan, np.nan)\n",
    "    # 统一分隔符和符号\n",
    "    s0 = s0.replace('，',',').replace('／','/').replace('—','-').replace('–','-')\n",
    "    # 判断周期：年 or 月\n",
    "    per_year = any(k in s0 for k in ['/年','年','per year','year'])\n",
    "    per_month = any(k in s0 for k in ['/月','月','per month','month'])\n",
    "\n",
    "    # 提取数字和单位（k, 万, 千 等）\n",
    "    def parse_piece(p):\n",
    "        m = re.search(r'(\\d+\\.?\\d*)(\\s*[kK千万万]?)', p)\n",
    "        if not m:\n",
    "            return None\n",
    "        val = float(m.group(1))\n",
    "        unit = m.group(2).strip()\n",
    "        if unit in ['k','K','千']:\n",
    "            val = val * 1000\n",
    "        elif unit in ['万']:\n",
    "            val = val * 10000\n",
    "        return val\n",
    "\n",
    "    # 先尝试用常见的范围分隔符拆分\n",
    "    parts = re.split(r'[-–—to至~]', s0)\n",
    "    nums = []\n",
    "    for p in parts[:2]:\n",
    "        v = parse_piece(p)\n",
    "        if v is not None:\n",
    "            nums.append(v)\n",
    "    # 若未提取到，尝试全局查找三位以上的数字（如 10000）\n",
    "    if len(nums) == 0:\n",
    "        m = re.search(r'(\\d{3,})', s0)\n",
    "        if m:\n",
    "            nums.append(float(m.group(1)))\n",
    "\n",
    "    if len(nums) == 0:\n",
    "        return (np.nan, np.nan)\n",
    "    if len(nums) == 1:\n",
    "        minv = maxv = nums[0]\n",
    "    else:\n",
    "        minv, maxv = nums[0], nums[1]\n",
    "\n",
    "    # 年薪换算为月薪（若字符串含年，但不含月）\n",
    "    if per_year and not per_month:\n",
    "        minv = minv / 12.0\n",
    "        maxv = maxv / 12.0\n",
    "\n",
    "    return (minv, maxv)\n",
    "\n",
    "# 应用解析并生成列\n",
    "parsed = df_clean[salary_col].apply(parse_salary)\n",
    "df_clean = df_clean.assign(min_salary=parsed.apply(lambda x: x[0]), max_salary=parsed.apply(lambda x: x[1]))\n",
    "\n",
    "# 删除无法解析出最低薪资的行（认为缺失）\n",
    "final = df_clean.dropna(subset=['min_salary'])\n",
    "\n",
    "# 构造输出 DataFrame 并重命名列\n",
    "result = final[[company_col, 'min_salary', 'max_salary']].copy()\n",
    "result.columns = ['company', 'min_salary', 'max_salary']\n",
    "# 强制数值类型（若存在小数则保留）\n",
    "result['min_salary'] = result['min_salary'].astype(float)\n",
    "result['max_salary'] = result['max_salary'].astype(float)\n",
    "\n",
    "# 结果说明\n",
    "print('注意：金额单位为人民币（元），已按月标准化。若原为年薪则已除以12。')\n",
    "print('示例（前50 行）：')\n",
    "display(result.head(50))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "a7a3919d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "员工表列： ['empno', 'ename', 'job', 'mgr', 'hiredate', 'sal', 'comm', 'deptno']\n",
      "部门表列： ['deptno', 'dname', 'loc']\n",
      "自动识别并使用合并键 -> 员工表: \"deptno\", 部门表: \"deptno\"\n",
      "合并后行数: 14, 列数: 10\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>empno</th>\n",
       "      <th>ename</th>\n",
       "      <th>job</th>\n",
       "      <th>mgr</th>\n",
       "      <th>hiredate</th>\n",
       "      <th>sal</th>\n",
       "      <th>comm</th>\n",
       "      <th>deptno</th>\n",
       "      <th>dname</th>\n",
       "      <th>loc</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>7369</td>\n",
       "      <td>张金山</td>\n",
       "      <td>职员</td>\n",
       "      <td>7902.0</td>\n",
       "      <td>1980/12/17</td>\n",
       "      <td>800</td>\n",
       "      <td>NaN</td>\n",
       "      <td>20</td>\n",
       "      <td>市场部</td>\n",
       "      <td>B区</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>7499</td>\n",
       "      <td>艾销</td>\n",
       "      <td>销售员</td>\n",
       "      <td>7698.0</td>\n",
       "      <td>1981/2/20</td>\n",
       "      <td>1600</td>\n",
       "      <td>300.0</td>\n",
       "      <td>30</td>\n",
       "      <td>财务部</td>\n",
       "      <td>C区</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>7521</td>\n",
       "      <td>伍销</td>\n",
       "      <td>销售员</td>\n",
       "      <td>7698.0</td>\n",
       "      <td>1981/2/22</td>\n",
       "      <td>1250</td>\n",
       "      <td>500.0</td>\n",
       "      <td>30</td>\n",
       "      <td>财务部</td>\n",
       "      <td>C区</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>7566</td>\n",
       "      <td>李太白</td>\n",
       "      <td>经理</td>\n",
       "      <td>7839.0</td>\n",
       "      <td>1980/4/2</td>\n",
       "      <td>2975</td>\n",
       "      <td>NaN</td>\n",
       "      <td>20</td>\n",
       "      <td>市场部</td>\n",
       "      <td>B区</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>7654</td>\n",
       "      <td>布雷</td>\n",
       "      <td>销售员</td>\n",
       "      <td>7698.0</td>\n",
       "      <td>1981/9/28</td>\n",
       "      <td>1250</td>\n",
       "      <td>1400.0</td>\n",
       "      <td>30</td>\n",
       "      <td>财务部</td>\n",
       "      <td>C区</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>7698</td>\n",
       "      <td>马丁</td>\n",
       "      <td>经理</td>\n",
       "      <td>7839.0</td>\n",
       "      <td>1981/1/5</td>\n",
       "      <td>2850</td>\n",
       "      <td>NaN</td>\n",
       "      <td>30</td>\n",
       "      <td>财务部</td>\n",
       "      <td>C区</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>7782</td>\n",
       "      <td>刘路</td>\n",
       "      <td>经理</td>\n",
       "      <td>7839.0</td>\n",
       "      <td>1981/6/9</td>\n",
       "      <td>2450</td>\n",
       "      <td>NaN</td>\n",
       "      <td>10</td>\n",
       "      <td>技术部</td>\n",
       "      <td>A区</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>7788</td>\n",
       "      <td>斯诺</td>\n",
       "      <td>职员</td>\n",
       "      <td>7566.0</td>\n",
       "      <td>1987/3/4</td>\n",
       "      <td>800</td>\n",
       "      <td>NaN</td>\n",
       "      <td>20</td>\n",
       "      <td>市场部</td>\n",
       "      <td>B区</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>7839</td>\n",
       "      <td>马小云</td>\n",
       "      <td>总经理</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1980/2/1</td>\n",
       "      <td>5000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>10</td>\n",
       "      <td>技术部</td>\n",
       "      <td>A区</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>7844</td>\n",
       "      <td>许云</td>\n",
       "      <td>销售</td>\n",
       "      <td>7698.0</td>\n",
       "      <td>1981/9/4</td>\n",
       "      <td>1500</td>\n",
       "      <td>0.0</td>\n",
       "      <td>30</td>\n",
       "      <td>财务部</td>\n",
       "      <td>C区</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>7876</td>\n",
       "      <td>安德</td>\n",
       "      <td>职员</td>\n",
       "      <td>7788.0</td>\n",
       "      <td>1987/6/3</td>\n",
       "      <td>1100</td>\n",
       "      <td>NaN</td>\n",
       "      <td>20</td>\n",
       "      <td>市场部</td>\n",
       "      <td>B区</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>7900</td>\n",
       "      <td>简思</td>\n",
       "      <td>职员</td>\n",
       "      <td>7698.0</td>\n",
       "      <td>1981/12/3</td>\n",
       "      <td>950</td>\n",
       "      <td>NaN</td>\n",
       "      <td>30</td>\n",
       "      <td>财务部</td>\n",
       "      <td>C区</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>7902</td>\n",
       "      <td>福特</td>\n",
       "      <td>分析师</td>\n",
       "      <td>7566.0</td>\n",
       "      <td>1981/12/3</td>\n",
       "      <td>3000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>20</td>\n",
       "      <td>市场部</td>\n",
       "      <td>B区</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>7934</td>\n",
       "      <td>米勒</td>\n",
       "      <td>职员</td>\n",
       "      <td>7782.0</td>\n",
       "      <td>1982/1/23</td>\n",
       "      <td>1300</td>\n",
       "      <td>NaN</td>\n",
       "      <td>10</td>\n",
       "      <td>技术部</td>\n",
       "      <td>A区</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    empno ename  job     mgr    hiredate   sal    comm  deptno dname loc\n",
       "0    7369   张金山   职员  7902.0  1980/12/17   800     NaN      20   市场部  B区\n",
       "1    7499    艾销  销售员  7698.0   1981/2/20  1600   300.0      30   财务部  C区\n",
       "2    7521    伍销  销售员  7698.0   1981/2/22  1250   500.0      30   财务部  C区\n",
       "3    7566   李太白   经理  7839.0    1980/4/2  2975     NaN      20   市场部  B区\n",
       "4    7654    布雷  销售员  7698.0   1981/9/28  1250  1400.0      30   财务部  C区\n",
       "5    7698    马丁   经理  7839.0    1981/1/5  2850     NaN      30   财务部  C区\n",
       "6    7782    刘路   经理  7839.0    1981/6/9  2450     NaN      10   技术部  A区\n",
       "7    7788    斯诺   职员  7566.0    1987/3/4   800     NaN      20   市场部  B区\n",
       "8    7839   马小云  总经理     NaN    1980/2/1  5000     NaN      10   技术部  A区\n",
       "9    7844    许云   销售  7698.0    1981/9/4  1500     0.0      30   财务部  C区\n",
       "10   7876    安德   职员  7788.0    1987/6/3  1100     NaN      20   市场部  B区\n",
       "11   7900    简思   职员  7698.0   1981/12/3   950     NaN      30   财务部  C区\n",
       "12   7902    福特  分析师  7566.0   1981/12/3  3000     NaN      20   市场部  B区\n",
       "13   7934    米勒   职员  7782.0   1982/1/23  1300     NaN      10   技术部  A区"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "已将合并结果保存到: d:\\Git\\my-warehouse\\1115\\merged_employee_department.csv\n"
     ]
    }
   ],
   "source": [
    "# 合并员工与部门数据并显示结果（执行此单元）\n",
    "import pandas as pd\n",
    "from pathlib import Path\n",
    "import sys\n",
    "\n",
    "def read_csv_try(path):\n",
    "    # 尝试 utf-8，然后 gbk 编码（兼容 Windows 常见编码）\n",
    "    try:\n",
    "        return pd.read_csv(path, encoding='utf-8')\n",
    "    except Exception:\n",
    "        try:\n",
    "            return pd.read_csv(path, encoding='gbk')\n",
    "        except Exception as e:\n",
    "            raise IOError(f'读取 {path} 失败: {e}')\n",
    "\n",
    "base = Path.cwd()\n",
    "emp_path = base / '员工表.csv'\n",
    "dept_path = base / '部门表.csv'\n",
    "if not emp_path.exists():\n",
    "    raise FileNotFoundError(f'未找到员工文件: {emp_path}')\n",
    "if not dept_path.exists():\n",
    "    raise FileNotFoundError(f'未找到部门文件: {dept_path}')\n",
    "\n",
    "emp = read_csv_try(emp_path)\n",
    "dept = read_csv_try(dept_path)\n",
    "\n",
    "print('员工表列：', list(emp.columns))\n",
    "print('部门表列：', list(dept.columns))\n",
    "\n",
    "# 智能识别合并键（优先常见 id 字段，其次列名包含关键词，其次通过值重叠匹配）\n",
    "def find_keys(left_df, right_df):\n",
    "    left_cols = [c for c in left_df.columns]\n",
    "    right_cols = [c for c in right_df.columns]\n",
    "    # 标准化小写映射\n",
    "    lmap = {c: c.lower() for c in left_cols}\n",
    "    rmap = {c: c.lower() for c in right_cols}\n",
    "    # 常见候选名\n",
    "    candidates = ['dept_id','department_id','部门id','部门编号','部门编码','deptno','dept_id','dept','部门','deptcode','department']\n",
    "    # 1) 寻找同名（忽略大小写）列\n",
    "    common = []\n",
    "    for lc,lcl in lmap.items():\n",
    "        for rc,rcl in rmap.items():\n",
    "            if lcl == rcl:\n",
    "                common.append((lc, rc))\n",
    "    if common:\n",
    "        return common[0]\n",
    "    # 2) 寻找包含候选关键词的列\n",
    "    for kw in candidates:\n",
    "        for lc,lcl in lmap.items():\n",
    "            if kw in lcl:\n",
    "                for rc,rcl in rmap.items():\n",
    "                    if kw in rcl or any(x in rcl for x in ['name','名称']):\n",
    "                        return (lc, rc)\n",
    "    # 3) 值重叠启发式：找 string/object 列对，若交集非空且比例>0.1 则使用\n",
    "    def string_cols(df):\n",
    "        return [c for c in df.columns if df[c].dtype == object or pd.api.types.is_string_dtype(df[c])]\n",
    "    lstr = string_cols(left_df)\n",
    "    rstr = string_cols(right_df)\n",
    "    for lc in lstr:\n",
    "        lvals = set([str(x).strip() for x in left_df[lc].dropna().unique() if str(x).strip()!=''])\n",
    "        if not lvals:\n",
    "            continue\n",
    "        for rc in rstr:\n",
    "            rvals = set([str(x).strip() for x in right_df[rc].dropna().unique() if str(x).strip()!=''])\n",
    "            if not rvals:\n",
    "                continue\n",
    "            inter = lvals & rvals\n",
    "            if len(inter) > 0 and (len(inter) / max(1, min(len(lvals), len(rvals)))) > 0.05:\n",
    "                return (lc, rc)\n",
    "    # 4) 没有找到合适键，返回 None\n",
    "    return None\n",
    "\n",
    "keys = find_keys(emp, dept)\n",
    "if keys is None:\n",
    "    print('\\n无法自动识别合并键。请检查列名并手动指定合并键，例如：')\n",
    "    print('员工表可能的列：', list(emp.columns))\n",
    "    print('部门表可能的列：', list(dept.columns))\n",
    "    raise ValueError('无法自动识别合并键，请手动检查列名。')\n",
    "left_on, right_on = keys\n",
    "print(f'自动识别并使用合并键 -> 员工表: \"{left_on}\", 部门表: \"{right_on}\"')\n",
    "\n",
    "# 执行合并（以员工表为主，左连接）\n",
    "merged = emp.merge(dept, how='left', left_on=left_on, right_on=right_on, suffixes=('_emp', '_dept'))\n",
    "print(f'合并后行数: {len(merged)}, 列数: {merged.shape[1]}')\n",
    "\n",
    "# 展示前 100 行以便检查\n",
    "from IPython.display import display\n",
    "display(merged.head(100))\n",
    "\n",
    "# 保存结果到 CSV\n",
    "out_path = base / 'merged_employee_department.csv'\n",
    "try:\n",
    "    merged.to_csv(out_path, index=False, encoding='utf-8-sig')\n",
    "    print(f'已将合并结果保存到: {out_path}')\n",
    "except Exception as e:\n",
    "    print('保存 CSV 失败:', e)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a494670f",
   "metadata": {},
   "source": [
    "## 任务1.3：二手房数据综合处理说明\n",
    "数据来源：`handroom.xlsx`（不同 sheet 存储）。\n",
    "本单元实现以下功能：\n",
    "- 读取所有 sheet 并合并为一个 DataFrame。\n",
    "- 字段拆分：将房屋建造年份/房龄、面积、楼层类型、户型（室/厅）等从原字段中拆分成独立列（尽量自动识别列名）。\n",
    "- 缺失值处理：删除小区名称（小区/社区）为空的行。\n",
    "- 重复值处理：检测并删除重复行，重新索引。\n",
    "- 异常值处理：针对“小区 = 翡翠城四期”按价格列（自动识别）检测并删除异常值（使用 IQR 方法）。\n",
    "- 统计分析：统计每个建造年份的房屋数、不同楼层类型的数量、每个小区的房屋平均价。\n",
    "输出：清洗后的 DataFrame（显示部分结果）并保存为 `handroom_cleaned.csv`。\n",
    "注意：本单元将尽量智能识别相关列（小区、价格、面积、建造年份/房龄、楼层、户型），若识别失败会打印候选列并提示手动指定。请在运行前确认 `handroom.xlsx` 存在于当前工作目录，或修改路径后再运行。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "69dea9ef",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "原始数据行数: 1061 列数: 7\n",
      "列名： ['区', '小区名称', '标题', '房屋信息', '关注', '地铁', '单价(元/平米)']\n",
      "使用的小区列: 小区名称\n",
      "删除小区缺失：2 行，剩余 1059 行\n",
      "使用的价格列: 单价(元/平米)\n",
      "识别的面积列: 单价(元/平米)\n",
      "识别的建造年份/房龄列: None\n",
      "识别的楼层列: None\n",
      "识别的户型列: None\n",
      "删除重复行数: 61\n",
      "在\"翡翠城四期\"中，移除异常前 21 行，移除后 20 行\n",
      "已保存清洗后数据到: handroom_cleaned.csv\n",
      "\n",
      "统计：每个建造年份的房屋数（按 build_year）\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "Series([], Name: count, dtype: int64)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "统计：不同楼层类型的数量（按原楼层字段显示）\n",
      "未识别到楼层列\n",
      "\n",
      "统计：每个小区的房屋平均价（price_val）——按小区分组，取均值，显示前50\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "小区名称\n",
       "四海逸家二期       43299.000000\n",
       "皇后国际         42703.500000\n",
       "铁狮门晶融汇       40634.233333\n",
       "时代豪庭三期       37793.100000\n",
       "西部国际金融中心     37512.100000\n",
       "锦江明珠         36633.000000\n",
       "华润幸福里一期      36103.700000\n",
       "仁恒滨河湾        35905.766667\n",
       "时代豪庭四期       35555.300000\n",
       "蓉上坊二期        35235.500000\n",
       "雕墅           34729.200000\n",
       "财富中心         34615.666667\n",
       "时代豪庭二期       33760.650000\n",
       "翡翠城三期        33163.000000\n",
       "翡翠城二期        33160.633333\n",
       "茂业豪园         32936.500000\n",
       "朗御           32605.014286\n",
       "恒大都汇华庭       32503.806250\n",
       "中粮鸿云         32184.200000\n",
       "红星国际         32060.900000\n",
       "晶蓝半岛一期       31962.000000\n",
       "南府街50号       31706.600000\n",
       "乐天圣苑         31488.357143\n",
       "时代豪庭一期       31389.885714\n",
       "万科城市花园       31251.666667\n",
       "世代锦江凯旋门      31179.100000\n",
       "指挥街36号       31130.200000\n",
       "翡翠城五期        30900.000000\n",
       "塔子山壹号        30801.250000\n",
       "瑞升望江橡树林三期    30755.300000\n",
       "德馨苑          30723.100000\n",
       "南府街27号       30597.700000\n",
       "东顺苑          30384.700000\n",
       "青石桥南街50号     30363.300000\n",
       "翰林美居         30250.450000\n",
       "中新街60号       30024.500000\n",
       "中电大厦         29994.000000\n",
       "春禧商务酒店       29635.600000\n",
       "COSMO天廊      29197.100000\n",
       "锦江逸家         28969.671429\n",
       "瑞升望江橡树林二期    28870.840000\n",
       "学道街42号       28547.800000\n",
       "大王钰城         28527.500000\n",
       "东湖国际         28059.472727\n",
       "天誉花园         28027.112500\n",
       "中莲池横街8号      27888.900000\n",
       "万科金润华府       27437.616667\n",
       "东安南路2号院      27314.800000\n",
       "中房优山PLUS     26617.650000\n",
       "翡翠城四期        26465.740000\n",
       "Name: price_val, dtype: float64"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "示例数据（前50行）:\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>区</th>\n",
       "      <th>小区名称</th>\n",
       "      <th>标题</th>\n",
       "      <th>房屋信息</th>\n",
       "      <th>关注</th>\n",
       "      <th>地铁</th>\n",
       "      <th>单价(元/平米)</th>\n",
       "      <th>bedrooms</th>\n",
       "      <th>livingrooms</th>\n",
       "      <th>area_sqm</th>\n",
       "      <th>build_year</th>\n",
       "      <th>price_val</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>锦江</td>\n",
       "      <td>时代豪庭一期</td>\n",
       "      <td>时代豪庭套三 中间楼层 有装修 业主处理资产出售</td>\n",
       "      <td>中楼层(共38层)| 2009年建 |3室1厅 | 155.79平米| 东南</td>\n",
       "      <td>137人关注/ 5月前发布</td>\n",
       "      <td>NaN</td>\n",
       "      <td>26959.4</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>26959.4</td>\n",
       "      <td>NaN</td>\n",
       "      <td>26959.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>锦江</td>\n",
       "      <td>卓锦城六期</td>\n",
       "      <td>卓锦城六期紫郡房源，套三，进门带入户</td>\n",
       "      <td>中楼层(共31层)| 2014年建 |3室1厅 | 89.33平米| 西南</td>\n",
       "      <td>36人关注 / 23天前发布</td>\n",
       "      <td>NaN</td>\n",
       "      <td>22612.8</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>22612.8</td>\n",
       "      <td>NaN</td>\n",
       "      <td>22612.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>.</td>\n",
       "      <td>星城银座</td>\n",
       "      <td>春熙路太古里标准套一出售，现租给民宿。</td>\n",
       "      <td>高楼层(共11层) | 2003年建 | 1室0厅 | 51.07平米 | 南</td>\n",
       "      <td>29人关注 / 5月前发布</td>\n",
       "      <td>近地铁</td>\n",
       "      <td>18014.5</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>18014.5</td>\n",
       "      <td>NaN</td>\n",
       "      <td>18014.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>锦江</td>\n",
       "      <td>新莲新苑</td>\n",
       "      <td>新莲新苑优质套三，诚心出售，近沙河，采光视野好。</td>\n",
       "      <td>高楼层(共7层) | 2001年建 | 3室1厅 | 77.7平米 | 东南</td>\n",
       "      <td>14人关注 / 5月前发布</td>\n",
       "      <td>NaN</td>\n",
       "      <td>13513.5</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>13513.5</td>\n",
       "      <td>NaN</td>\n",
       "      <td>13513.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>锦江</td>\n",
       "      <td>俊发星雅俊园</td>\n",
       "      <td>星雅俊园优质平层公寓，业主精装修，套二带家具家电</td>\n",
       "      <td>低楼层(共37层) | 2015年建 | 2室1厅 | 59平米 | 西南</td>\n",
       "      <td>80人关注 / 4月前发布</td>\n",
       "      <td>NaN</td>\n",
       "      <td>13220.3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>13220.3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>13220.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>锦江</td>\n",
       "      <td>四海逸家二期</td>\n",
       "      <td>四海二期 跃层双卫户型  套四带大阳台 带暖气</td>\n",
       "      <td>高楼层(共34层) | 2013年建 | 4室2厅 | 97平米 | 东南</td>\n",
       "      <td>106人关注 / 11月前发布</td>\n",
       "      <td>NaN</td>\n",
       "      <td>43299.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>43299.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>43299.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>锦江</td>\n",
       "      <td>万科金色城市二期</td>\n",
       "      <td>明厨明卫采光好，户型方正，精装修，视野开阔。</td>\n",
       "      <td>低楼层(共29层) | 2017年建 | 3室2厅 | 80平米 | 东南 西北</td>\n",
       "      <td>206人关注 / 5月前发布</td>\n",
       "      <td>近地铁</td>\n",
       "      <td>17000.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>17000.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>17000.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>锦江</td>\n",
       "      <td>流星花园</td>\n",
       "      <td>此房安静不临街，交通便利，出后门就是远洋太古里</td>\n",
       "      <td>中楼层(共16层) | 2003年建 | 3室1厅 | 132.13平米 | 东 北</td>\n",
       "      <td>126人关注 / 5月前发布</td>\n",
       "      <td>近地铁</td>\n",
       "      <td>17785.5</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>17785.5</td>\n",
       "      <td>NaN</td>\n",
       "      <td>17785.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>锦江</td>\n",
       "      <td>牛市口路56号</td>\n",
       "      <td>二环内 地铁口  套二  户型方正</td>\n",
       "      <td>高楼层(共7层) | 2000年建 | 2室1厅 | 64.29平米 | 东南</td>\n",
       "      <td>217人关注 / 12月前发布</td>\n",
       "      <td>NaN</td>\n",
       "      <td>11199.3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>11199.3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>11199.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>锦江</td>\n",
       "      <td>萃锦东路342号</td>\n",
       "      <td>沙河边小套二，户型方正，居家装修</td>\n",
       "      <td>高楼层(共6层) | 2017年建 | 2室2厅 | 62.17平米 | 东南</td>\n",
       "      <td>105人关注 / 8月前发布</td>\n",
       "      <td>NaN</td>\n",
       "      <td>12546.2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>12546.2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>12546.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>锦江</td>\n",
       "      <td>恒大都汇华庭</td>\n",
       "      <td>市中 心 太古里 都汇华庭 精装套四 顶跃 诚心出售</td>\n",
       "      <td>高楼层(共50层) | 2015年建 | 4室2厅 | 201.56平米 | 西北</td>\n",
       "      <td>26人关注 / 8月前发布</td>\n",
       "      <td>近地铁</td>\n",
       "      <td>39690.4</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>39690.4</td>\n",
       "      <td>NaN</td>\n",
       "      <td>39690.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>锦江</td>\n",
       "      <td>乐天圣苑</td>\n",
       "      <td>乐天精装套三 户型方正保养好 可拎包入住</td>\n",
       "      <td>中楼层(共33层) | 2017年建 | 3室1厅 | 81.19平米 | 南</td>\n",
       "      <td>94人关注 / 5月前发布</td>\n",
       "      <td>NaN</td>\n",
       "      <td>27712.8</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>27712.8</td>\n",
       "      <td>NaN</td>\n",
       "      <td>27712.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>锦江</td>\n",
       "      <td>紫东梵谷</td>\n",
       "      <td>紫东梵谷套二主卧带阳台安静不吵</td>\n",
       "      <td>中楼层(共18层) | 2007年建 | 2室2厅 | 83.6平米 | 西北</td>\n",
       "      <td>50人关注 / 4月前发布</td>\n",
       "      <td>NaN</td>\n",
       "      <td>12559.8</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>12559.8</td>\n",
       "      <td>NaN</td>\n",
       "      <td>12559.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>锦江</td>\n",
       "      <td>国嘉新视界</td>\n",
       "      <td>国嘉新视界套二出售    户型方正不临街   适合居家</td>\n",
       "      <td>高楼层(共32层) | 2009年建 | 2室1厅 | 55.61平米 | 东北</td>\n",
       "      <td>114人关注 / 8月前发布</td>\n",
       "      <td>近地铁</td>\n",
       "      <td>18701.7</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>18701.7</td>\n",
       "      <td>NaN</td>\n",
       "      <td>18701.7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>锦江</td>\n",
       "      <td>上东锦城A区</td>\n",
       "      <td>户型方正，客厅带阳台，对中庭，安静不临街</td>\n",
       "      <td>中楼层(共34层) | 2008年建 | 2室2厅 | 82.13平米 | 东南</td>\n",
       "      <td>168人关注 / 5月前发布</td>\n",
       "      <td>NaN</td>\n",
       "      <td>13636.9</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>13636.9</td>\n",
       "      <td>NaN</td>\n",
       "      <td>13636.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>锦江</td>\n",
       "      <td>皇经花园B区</td>\n",
       "      <td>皇经花园精装跃层套三，楼顶带花园。</td>\n",
       "      <td>高楼层(共6层) | 2007年建 | 3室1厅 | 96平米 | 南</td>\n",
       "      <td>164人关注 / 1年前发布</td>\n",
       "      <td>NaN</td>\n",
       "      <td>15625.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>15625.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>15625.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>锦江</td>\n",
       "      <td>南府锦</td>\n",
       "      <td>九眼桥跃层套三改套二可看河景，视野好</td>\n",
       "      <td>中楼层(共17层) | 2004年建 | 3室2厅 | 133.96平米 | 东北</td>\n",
       "      <td>83人关注 / 4月前发布</td>\n",
       "      <td>NaN</td>\n",
       "      <td>17020.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>17020.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>17020.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>锦江</td>\n",
       "      <td>翡翠城四期</td>\n",
       "      <td>翡翠城四期 跃层双卫户型 可看沙河 采光视野好</td>\n",
       "      <td>低楼层(共29层) | 2009年建 | 2室2厅 | 85.21平米 | 东南</td>\n",
       "      <td>110人关注 / 6月前发布</td>\n",
       "      <td>近地铁</td>\n",
       "      <td>17251.5</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>17251.5</td>\n",
       "      <td>NaN</td>\n",
       "      <td>17251.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>锦江</td>\n",
       "      <td>玉沙路134号</td>\n",
       "      <td>玉沙路134号                 套三出售</td>\n",
       "      <td>高楼层(共6层) | 1990年建 | 3室2厅 | 111.68平米 | 东南</td>\n",
       "      <td>130人关注 / 9月前发布</td>\n",
       "      <td>近地铁</td>\n",
       "      <td>11192.7</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>11192.7</td>\n",
       "      <td>NaN</td>\n",
       "      <td>11192.7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>锦江</td>\n",
       "      <td>万科金色城市</td>\n",
       "      <td>万科金色城市标准套三，开发商统一精装。</td>\n",
       "      <td>低楼层(共35层) | 2016年建 | 3室2厅 | 87平米 | 东</td>\n",
       "      <td>80人关注 / 9月前发布</td>\n",
       "      <td>近地铁</td>\n",
       "      <td>17471.3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>17471.3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>17471.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>锦江</td>\n",
       "      <td>世代锦江凯旋门</td>\n",
       "      <td>世代锦江凯旋门 河景朝南大套二 可做套三 户型好</td>\n",
       "      <td>高楼层(共29层) | 2008年建 | 2室2厅 | 123.48平米 | 南</td>\n",
       "      <td>93人关注 / 4月前发布</td>\n",
       "      <td>近地铁</td>\n",
       "      <td>31179.1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>31179.1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>31179.1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>锦江</td>\n",
       "      <td>红星国际</td>\n",
       "      <td>红星国际小户型+明厨明卫采光好+双窗户</td>\n",
       "      <td>低楼层(共34层) | 2012年建 | 1室1厅 | 40.47平米 | 东南</td>\n",
       "      <td>50人关注 / 11月前发布</td>\n",
       "      <td>近地铁</td>\n",
       "      <td>34099.3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>34099.3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>34099.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>锦江</td>\n",
       "      <td>城市理想</td>\n",
       "      <td>城市理想公寓、四楼带空中花园。</td>\n",
       "      <td>高楼层(共28层) | 2010年建 | 1室1厅 | 52.49平米 | 东南</td>\n",
       "      <td>34人关注 / 10月前发布</td>\n",
       "      <td>近地铁</td>\n",
       "      <td>12383.3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>12383.3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>12383.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>锦江</td>\n",
       "      <td>ICC凯旋门</td>\n",
       "      <td>清水套三双卫，客厅带阳台，层高3.2米</td>\n",
       "      <td>高楼层(共43层) | 2018年建 | 3室2厅 | 133.72平米 | 北</td>\n",
       "      <td>95人关注 / 2月前发布</td>\n",
       "      <td>近地铁</td>\n",
       "      <td>25800.2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>25800.2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>25800.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>锦江</td>\n",
       "      <td>锦江城市花园二期</td>\n",
       "      <td>锦江城市花园二期精装套二，有主卧带生活阳台。</td>\n",
       "      <td>低楼层(共34层) | 2010年建 | 2室1厅 | 51.23平米 | 南</td>\n",
       "      <td>60人关注 / 4月前发布</td>\n",
       "      <td>NaN</td>\n",
       "      <td>16982.2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>16982.2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>16982.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>锦江</td>\n",
       "      <td>东御佲家</td>\n",
       "      <td>全屋品牌轻奢装修，家具家电配齐，拎包入住！</td>\n",
       "      <td>低楼层(共27层) | 2017年建 | 1室1厅 | 43平米 | 东南</td>\n",
       "      <td>50人关注 / 2月前发布</td>\n",
       "      <td>NaN</td>\n",
       "      <td>19767.4</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>19767.4</td>\n",
       "      <td>NaN</td>\n",
       "      <td>19767.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>锦江</td>\n",
       "      <td>美华大厦</td>\n",
       "      <td>盐道街  标准套二 精装修 有平台 可直接入住</td>\n",
       "      <td>中楼层(共16层) | 1999年建 | 2室1厅 | 73.49平米 | 南</td>\n",
       "      <td>99人关注 / 9月前发布</td>\n",
       "      <td>近地铁</td>\n",
       "      <td>25581.7</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>25581.7</td>\n",
       "      <td>NaN</td>\n",
       "      <td>25581.7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>锦江</td>\n",
       "      <td>华宇广场</td>\n",
       "      <td>华宇清水优质套二房东诚心出售中</td>\n",
       "      <td>中楼层(共32层) | 2017年建 | 3室2厅 | 71.79平米 | 东北</td>\n",
       "      <td>92人关注 / 4月前发布</td>\n",
       "      <td>近地铁</td>\n",
       "      <td>20197.8</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>20197.8</td>\n",
       "      <td>NaN</td>\n",
       "      <td>20197.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>锦江</td>\n",
       "      <td>卓锦城一期</td>\n",
       "      <td>标准套三 一梯两户  视野好！</td>\n",
       "      <td>高楼层(共6层) | 2006年建 | 3室1厅 | 123平米 | 东南</td>\n",
       "      <td>110人关注 / 10月前发布</td>\n",
       "      <td>NaN</td>\n",
       "      <td>15447.2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>15447.2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>15447.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>锦江</td>\n",
       "      <td>柳江新居五期</td>\n",
       "      <td>琉璃场柳江新居五期带装修套二，房东诚心出售</td>\n",
       "      <td>低楼层(共18层) | 2010年建 | 2室1厅 | 80.97平米 | 南 北</td>\n",
       "      <td>58人关注 / 3月前发布</td>\n",
       "      <td>近地铁</td>\n",
       "      <td>13832.3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>13832.3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>13832.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31</th>\n",
       "      <td>锦江</td>\n",
       "      <td>锦洲花园</td>\n",
       "      <td>此房是锦洲花园中庭跃层，顶楼带大花园</td>\n",
       "      <td>高楼层(共6层) | 2004年建 | 3室2厅 | 124.68平米 | 西北</td>\n",
       "      <td>135人关注 / 5月前发布</td>\n",
       "      <td>NaN</td>\n",
       "      <td>19890.9</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>19890.9</td>\n",
       "      <td>NaN</td>\n",
       "      <td>19890.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>锦江</td>\n",
       "      <td>万科城市花园</td>\n",
       "      <td>万科城市花园叠拼洋房有花园。。。。。。。。。。</td>\n",
       "      <td>低楼层(共3层) | 2007年建 | 3室1厅 | 112.14平米 | 东南</td>\n",
       "      <td>111人关注 / 2年前发布</td>\n",
       "      <td>近地铁</td>\n",
       "      <td>31746.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>31746.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>31746.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33</th>\n",
       "      <td>锦江</td>\n",
       "      <td>皇城公寓</td>\n",
       "      <td>青羊区人民南路一段97号 两室一厅一厨一卫</td>\n",
       "      <td>中楼层(共24层) | 2003年建 | 2室1厅 | 86.84平米 | 南</td>\n",
       "      <td>87人关注 / 9月前发布</td>\n",
       "      <td>近地铁</td>\n",
       "      <td>14970.1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>14970.1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>14970.1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34</th>\n",
       "      <td>锦江</td>\n",
       "      <td>东洪广厦</td>\n",
       "      <td>高楼层，清水，视野开阔，客厅带阳台</td>\n",
       "      <td>高楼层(共34层) | 2013年建 | 2室1厅 | 70.64平米 | 东南</td>\n",
       "      <td>66人关注 / 5月前发布</td>\n",
       "      <td>近地铁</td>\n",
       "      <td>15727.6</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>15727.6</td>\n",
       "      <td>NaN</td>\n",
       "      <td>15727.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>35</th>\n",
       "      <td>锦江</td>\n",
       "      <td>锦江国际花园</td>\n",
       "      <td>锦江国际花园 精装套三 动静分区 户型方正</td>\n",
       "      <td>中楼层(共42层) | 2016年建 | 3室2厅 | 84.28平米 | 东南</td>\n",
       "      <td>294人关注 / 1年前发布</td>\n",
       "      <td>近地铁</td>\n",
       "      <td>21120.1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>21120.1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>21120.1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36</th>\n",
       "      <td>锦江</td>\n",
       "      <td>华都美林湾</td>\n",
       "      <td>华都美林湾套二  对中庭  可看湖</td>\n",
       "      <td>高楼层(共33层) | 2010年建 | 2室1厅 | 90平米 | 西南</td>\n",
       "      <td>78人关注 / 7月前发布</td>\n",
       "      <td>NaN</td>\n",
       "      <td>16444.4</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>16444.4</td>\n",
       "      <td>NaN</td>\n",
       "      <td>16444.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>37</th>\n",
       "      <td>锦江</td>\n",
       "      <td>东恒国际</td>\n",
       "      <td>东恒国际，2号线牛王庙站 临春熙路太古里，套二出售</td>\n",
       "      <td>低楼层(共22层) | 2007年建 | 2室1厅 | 67.59平米 | 南</td>\n",
       "      <td>42人关注 / 4月前发布</td>\n",
       "      <td>NaN</td>\n",
       "      <td>11096.3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>11096.3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>11096.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38</th>\n",
       "      <td>锦江</td>\n",
       "      <td>天誉花园</td>\n",
       "      <td>天誉花园大平层，有装修，有地暖，居家好房</td>\n",
       "      <td>高楼层(共31层) | 2016年建 | 5室2厅 | 230平米 | 东南</td>\n",
       "      <td>87人关注 / 7月前发布</td>\n",
       "      <td>NaN</td>\n",
       "      <td>32608.7</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>32608.7</td>\n",
       "      <td>NaN</td>\n",
       "      <td>32608.7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>39</th>\n",
       "      <td>锦江</td>\n",
       "      <td>紫东梵谷</td>\n",
       "      <td>三环内，标准套二，居家装修，家具家电九成新</td>\n",
       "      <td>高楼层(共18层) | 2007年建 | 2室1厅 | 80.23平米 | 东南</td>\n",
       "      <td>49人关注 / 4月前发布</td>\n",
       "      <td>NaN</td>\n",
       "      <td>13959.9</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>13959.9</td>\n",
       "      <td>NaN</td>\n",
       "      <td>13959.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>40</th>\n",
       "      <td>锦江</td>\n",
       "      <td>蓝润V客东都二期</td>\n",
       "      <td>蓝润V客东都二期 2室2厅 东南</td>\n",
       "      <td>高楼层(共38层) | 2017年建 | 2室2厅 | 62.47平米 | 东南</td>\n",
       "      <td>33人关注 / 5月前发布</td>\n",
       "      <td>近地铁</td>\n",
       "      <td>17448.4</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>17448.4</td>\n",
       "      <td>NaN</td>\n",
       "      <td>17448.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41</th>\n",
       "      <td>锦江</td>\n",
       "      <td>沙河壹号二期</td>\n",
       "      <td>二环边，成熟社区，居家温馨小三居，卧室带大阳台！</td>\n",
       "      <td>高楼层(共34层) | 2013年建 | 3室2厅 | 84.44平米 | 东</td>\n",
       "      <td>54人关注 / 3月前发布</td>\n",
       "      <td>NaN</td>\n",
       "      <td>19895.8</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>19895.8</td>\n",
       "      <td>NaN</td>\n",
       "      <td>19895.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42</th>\n",
       "      <td>锦江</td>\n",
       "      <td>东湖国际</td>\n",
       "      <td>东湖国际 套一 房东急售 低于市场价</td>\n",
       "      <td>高楼层(共33层) | 2012年建 | 1室1厅 | 47平米 | 东南</td>\n",
       "      <td>120人关注 / 10月前发布</td>\n",
       "      <td>NaN</td>\n",
       "      <td>24042.6</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>24042.6</td>\n",
       "      <td>NaN</td>\n",
       "      <td>24042.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>43</th>\n",
       "      <td>锦江</td>\n",
       "      <td>大观苑二期</td>\n",
       "      <td>大观苑二期，套一，满五年没有增值税</td>\n",
       "      <td>高楼层(共6层) | 2011年建 | 1室1厅 | 43.28平米 | 东南</td>\n",
       "      <td>80人关注 / 6月前发布</td>\n",
       "      <td>NaN</td>\n",
       "      <td>11552.7</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>11552.7</td>\n",
       "      <td>NaN</td>\n",
       "      <td>11552.7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>44</th>\n",
       "      <td>锦江</td>\n",
       "      <td>望江锦园</td>\n",
       "      <td>望江锦园 精装 标准套二  保养很好</td>\n",
       "      <td>高楼层(共35层) 2室2厅 | 73平米 | 东南</td>\n",
       "      <td>43人关注 / 9月前发布</td>\n",
       "      <td>NaN</td>\n",
       "      <td>19041.1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>19041.1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>19041.1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>45</th>\n",
       "      <td>锦江</td>\n",
       "      <td>朝阳逸景</td>\n",
       "      <td>朝阳逸景精装套二带花园电梯小高层</td>\n",
       "      <td>低楼层(共11层) | 2004年建 | 2室1厅 | 79.24平米 | 西北</td>\n",
       "      <td>14人关注 / 5月前发布</td>\n",
       "      <td>NaN</td>\n",
       "      <td>25239.8</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>25239.8</td>\n",
       "      <td>NaN</td>\n",
       "      <td>25239.8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>46</th>\n",
       "      <td>锦江</td>\n",
       "      <td>府河名居(锦江)</td>\n",
       "      <td>东门大桥旁标准套三，宜居家，交通便利。</td>\n",
       "      <td>中楼层(共9层) | 2000年建 | 3室1厅 | 88.34平米 | 东南</td>\n",
       "      <td>126人关注 / 7月前发布</td>\n",
       "      <td>近地铁</td>\n",
       "      <td>19357.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>19357.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>19357.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>47</th>\n",
       "      <td>锦江</td>\n",
       "      <td>东新街5号</td>\n",
       "      <td>一环内，锦江区  中间楼层小套三</td>\n",
       "      <td>中楼层(共8层) 3室1厅 | 65.78平米 | 西南</td>\n",
       "      <td>11人关注 / 2月前发布</td>\n",
       "      <td>近地铁</td>\n",
       "      <td>12921.9</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>12921.9</td>\n",
       "      <td>NaN</td>\n",
       "      <td>12921.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48</th>\n",
       "      <td>锦江</td>\n",
       "      <td>王府井</td>\n",
       "      <td>春熙路旁边+电梯+住宅+套三....</td>\n",
       "      <td>高楼层(共25层) | 2000年建 | 3室2厅 | 140.93平米 | 南 西</td>\n",
       "      <td>32人关注 / 8月前发布</td>\n",
       "      <td>近地铁</td>\n",
       "      <td>12772.3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>12772.3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>12772.3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49</th>\n",
       "      <td>锦江</td>\n",
       "      <td>人居锦尚春天B区</td>\n",
       "      <td>人居锦尚春天B区精装大套三户型方正</td>\n",
       "      <td>低楼层(共41层) | 2013年建 | 3室2厅 | 93.14平米 | 西北</td>\n",
       "      <td>79人关注 / 5月前发布</td>\n",
       "      <td>NaN</td>\n",
       "      <td>15031.1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>15031.1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>15031.1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50</th>\n",
       "      <td>锦江</td>\n",
       "      <td>鑫苑名家二期</td>\n",
       "      <td>鑫苑名家二期居家装修。采光好，东南朝向</td>\n",
       "      <td>高楼层(共34层) | 2012年建 | 2室1厅 | 79平米 | 东南</td>\n",
       "      <td>57人关注 / 5月前发布</td>\n",
       "      <td>NaN</td>\n",
       "      <td>15949.4</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>15949.4</td>\n",
       "      <td>NaN</td>\n",
       "      <td>15949.4</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     区      小区名称                            标题  \\\n",
       "1   锦江    时代豪庭一期      时代豪庭套三 中间楼层 有装修 业主处理资产出售   \n",
       "2   锦江     卓锦城六期            卓锦城六期紫郡房源，套三，进门带入户   \n",
       "3    .      星城银座           春熙路太古里标准套一出售，现租给民宿。   \n",
       "4   锦江      新莲新苑      新莲新苑优质套三，诚心出售，近沙河，采光视野好。   \n",
       "5   锦江    俊发星雅俊园      星雅俊园优质平层公寓，业主精装修，套二带家具家电   \n",
       "6   锦江    四海逸家二期       四海二期 跃层双卫户型  套四带大阳台 带暖气   \n",
       "7   锦江  万科金色城市二期        明厨明卫采光好，户型方正，精装修，视野开阔。   \n",
       "8   锦江      流星花园       此房安静不临街，交通便利，出后门就是远洋太古里   \n",
       "9   锦江   牛市口路56号             二环内 地铁口  套二  户型方正   \n",
       "10  锦江  萃锦东路342号              沙河边小套二，户型方正，居家装修   \n",
       "11  锦江    恒大都汇华庭    市中 心 太古里 都汇华庭 精装套四 顶跃 诚心出售   \n",
       "12  锦江      乐天圣苑          乐天精装套三 户型方正保养好 可拎包入住   \n",
       "13  锦江      紫东梵谷               紫东梵谷套二主卧带阳台安静不吵   \n",
       "14  锦江     国嘉新视界   国嘉新视界套二出售    户型方正不临街   适合居家   \n",
       "15  锦江    上东锦城A区          户型方正，客厅带阳台，对中庭，安静不临街   \n",
       "16  锦江    皇经花园B区             皇经花园精装跃层套三，楼顶带花园。   \n",
       "17  锦江       南府锦            九眼桥跃层套三改套二可看河景，视野好   \n",
       "18  锦江     翡翠城四期       翡翠城四期 跃层双卫户型 可看沙河 采光视野好   \n",
       "19  锦江   玉沙路134号  玉沙路134号                 套三出售   \n",
       "20  锦江    万科金色城市           万科金色城市标准套三，开发商统一精装。   \n",
       "21  锦江   世代锦江凯旋门      世代锦江凯旋门 河景朝南大套二 可做套三 户型好   \n",
       "22  锦江      红星国际           红星国际小户型+明厨明卫采光好+双窗户   \n",
       "23  锦江      城市理想               城市理想公寓、四楼带空中花园。   \n",
       "24  锦江    ICC凯旋门           清水套三双卫，客厅带阳台，层高3.2米   \n",
       "25  锦江  锦江城市花园二期        锦江城市花园二期精装套二，有主卧带生活阳台。   \n",
       "26  锦江      东御佲家         全屋品牌轻奢装修，家具家电配齐，拎包入住！   \n",
       "27  锦江      美华大厦       盐道街  标准套二 精装修 有平台 可直接入住   \n",
       "28  锦江      华宇广场               华宇清水优质套二房东诚心出售中   \n",
       "29  锦江     卓锦城一期               标准套三 一梯两户  视野好！   \n",
       "30  锦江    柳江新居五期         琉璃场柳江新居五期带装修套二，房东诚心出售   \n",
       "31  锦江      锦洲花园            此房是锦洲花园中庭跃层，顶楼带大花园   \n",
       "32  锦江    万科城市花园       万科城市花园叠拼洋房有花园。。。。。。。。。。   \n",
       "33  锦江      皇城公寓         青羊区人民南路一段97号 两室一厅一厨一卫   \n",
       "34  锦江      东洪广厦             高楼层，清水，视野开阔，客厅带阳台   \n",
       "35  锦江    锦江国际花园         锦江国际花园 精装套三 动静分区 户型方正   \n",
       "36  锦江     华都美林湾             华都美林湾套二  对中庭  可看湖   \n",
       "37  锦江      东恒国际     东恒国际，2号线牛王庙站 临春熙路太古里，套二出售   \n",
       "38  锦江      天誉花园          天誉花园大平层，有装修，有地暖，居家好房   \n",
       "39  锦江      紫东梵谷         三环内，标准套二，居家装修，家具家电九成新   \n",
       "40  锦江  蓝润V客东都二期              蓝润V客东都二期 2室2厅 东南   \n",
       "41  锦江    沙河壹号二期      二环边，成熟社区，居家温馨小三居，卧室带大阳台！   \n",
       "42  锦江      东湖国际            东湖国际 套一 房东急售 低于市场价   \n",
       "43  锦江     大观苑二期             大观苑二期，套一，满五年没有增值税   \n",
       "44  锦江      望江锦园            望江锦园 精装 标准套二  保养很好   \n",
       "45  锦江      朝阳逸景              朝阳逸景精装套二带花园电梯小高层   \n",
       "46  锦江  府河名居(锦江)           东门大桥旁标准套三，宜居家，交通便利。   \n",
       "47  锦江     东新街5号              一环内，锦江区  中间楼层小套三   \n",
       "48  锦江       王府井            春熙路旁边+电梯+住宅+套三....   \n",
       "49  锦江  人居锦尚春天B区             人居锦尚春天B区精装大套三户型方正   \n",
       "50  锦江    鑫苑名家二期           鑫苑名家二期居家装修。采光好，东南朝向   \n",
       "\n",
       "                                          房屋信息               关注   地铁  \\\n",
       "1       中楼层(共38层)| 2009年建 |3室1厅 | 155.79平米| 东南    137人关注/ 5月前发布  NaN   \n",
       "2        中楼层(共31层)| 2014年建 |3室1厅 | 89.33平米| 西南   36人关注 / 23天前发布  NaN   \n",
       "3      高楼层(共11层) | 2003年建 | 1室0厅 | 51.07平米 | 南    29人关注 / 5月前发布  近地铁   \n",
       "4       高楼层(共7层) | 2001年建 | 3室1厅 | 77.7平米 | 东南    14人关注 / 5月前发布  NaN   \n",
       "5        低楼层(共37层) | 2015年建 | 2室1厅 | 59平米 | 西南    80人关注 / 4月前发布  NaN   \n",
       "6        高楼层(共34层) | 2013年建 | 4室2厅 | 97平米 | 东南  106人关注 / 11月前发布  NaN   \n",
       "7     低楼层(共29层) | 2017年建 | 3室2厅 | 80平米 | 东南 西北   206人关注 / 5月前发布  近地铁   \n",
       "8   中楼层(共16层) | 2003年建 | 3室1厅 | 132.13平米 | 东 北   126人关注 / 5月前发布  近地铁   \n",
       "9      高楼层(共7层) | 2000年建 | 2室1厅 | 64.29平米 | 东南  217人关注 / 12月前发布  NaN   \n",
       "10     高楼层(共6层) | 2017年建 | 2室2厅 | 62.17平米 | 东南   105人关注 / 8月前发布  NaN   \n",
       "11   高楼层(共50层) | 2015年建 | 4室2厅 | 201.56平米 | 西北    26人关注 / 8月前发布  近地铁   \n",
       "12     中楼层(共33层) | 2017年建 | 3室1厅 | 81.19平米 | 南    94人关注 / 5月前发布  NaN   \n",
       "13     中楼层(共18层) | 2007年建 | 2室2厅 | 83.6平米 | 西北    50人关注 / 4月前发布  NaN   \n",
       "14    高楼层(共32层) | 2009年建 | 2室1厅 | 55.61平米 | 东北   114人关注 / 8月前发布  近地铁   \n",
       "15    中楼层(共34层) | 2008年建 | 2室2厅 | 82.13平米 | 东南   168人关注 / 5月前发布  NaN   \n",
       "16         高楼层(共6层) | 2007年建 | 3室1厅 | 96平米 | 南   164人关注 / 1年前发布  NaN   \n",
       "17   中楼层(共17层) | 2004年建 | 3室2厅 | 133.96平米 | 东北    83人关注 / 4月前发布  NaN   \n",
       "18    低楼层(共29层) | 2009年建 | 2室2厅 | 85.21平米 | 东南   110人关注 / 6月前发布  近地铁   \n",
       "19    高楼层(共6层) | 1990年建 | 3室2厅 | 111.68平米 | 东南   130人关注 / 9月前发布  近地铁   \n",
       "20        低楼层(共35层) | 2016年建 | 3室2厅 | 87平米 | 东    80人关注 / 9月前发布  近地铁   \n",
       "21    高楼层(共29层) | 2008年建 | 2室2厅 | 123.48平米 | 南    93人关注 / 4月前发布  近地铁   \n",
       "22    低楼层(共34层) | 2012年建 | 1室1厅 | 40.47平米 | 东南   50人关注 / 11月前发布  近地铁   \n",
       "23    高楼层(共28层) | 2010年建 | 1室1厅 | 52.49平米 | 东南   34人关注 / 10月前发布  近地铁   \n",
       "24    高楼层(共43层) | 2018年建 | 3室2厅 | 133.72平米 | 北    95人关注 / 2月前发布  近地铁   \n",
       "25     低楼层(共34层) | 2010年建 | 2室1厅 | 51.23平米 | 南    60人关注 / 4月前发布  NaN   \n",
       "26       低楼层(共27层) | 2017年建 | 1室1厅 | 43平米 | 东南    50人关注 / 2月前发布  NaN   \n",
       "27     中楼层(共16层) | 1999年建 | 2室1厅 | 73.49平米 | 南    99人关注 / 9月前发布  近地铁   \n",
       "28    中楼层(共32层) | 2017年建 | 3室2厅 | 71.79平米 | 东北    92人关注 / 4月前发布  近地铁   \n",
       "29       高楼层(共6层) | 2006年建 | 3室1厅 | 123平米 | 东南  110人关注 / 10月前发布  NaN   \n",
       "30   低楼层(共18层) | 2010年建 | 2室1厅 | 80.97平米 | 南 北    58人关注 / 3月前发布  近地铁   \n",
       "31    高楼层(共6层) | 2004年建 | 3室2厅 | 124.68平米 | 西北   135人关注 / 5月前发布  NaN   \n",
       "32    低楼层(共3层) | 2007年建 | 3室1厅 | 112.14平米 | 东南   111人关注 / 2年前发布  近地铁   \n",
       "33     中楼层(共24层) | 2003年建 | 2室1厅 | 86.84平米 | 南    87人关注 / 9月前发布  近地铁   \n",
       "34    高楼层(共34层) | 2013年建 | 2室1厅 | 70.64平米 | 东南    66人关注 / 5月前发布  近地铁   \n",
       "35    中楼层(共42层) | 2016年建 | 3室2厅 | 84.28平米 | 东南   294人关注 / 1年前发布  近地铁   \n",
       "36       高楼层(共33层) | 2010年建 | 2室1厅 | 90平米 | 西南    78人关注 / 7月前发布  NaN   \n",
       "37     低楼层(共22层) | 2007年建 | 2室1厅 | 67.59平米 | 南    42人关注 / 4月前发布  NaN   \n",
       "38      高楼层(共31层) | 2016年建 | 5室2厅 | 230平米 | 东南    87人关注 / 7月前发布  NaN   \n",
       "39    高楼层(共18层) | 2007年建 | 2室1厅 | 80.23平米 | 东南    49人关注 / 4月前发布  NaN   \n",
       "40    高楼层(共38层) | 2017年建 | 2室2厅 | 62.47平米 | 东南    33人关注 / 5月前发布  近地铁   \n",
       "41     高楼层(共34层) | 2013年建 | 3室2厅 | 84.44平米 | 东    54人关注 / 3月前发布  NaN   \n",
       "42       高楼层(共33层) | 2012年建 | 1室1厅 | 47平米 | 东南  120人关注 / 10月前发布  NaN   \n",
       "43     高楼层(共6层) | 2011年建 | 1室1厅 | 43.28平米 | 东南    80人关注 / 6月前发布  NaN   \n",
       "44                  高楼层(共35层) 2室2厅 | 73平米 | 东南    43人关注 / 9月前发布  NaN   \n",
       "45    低楼层(共11层) | 2004年建 | 2室1厅 | 79.24平米 | 西北    14人关注 / 5月前发布  NaN   \n",
       "46     中楼层(共9层) | 2000年建 | 3室1厅 | 88.34平米 | 东南   126人关注 / 7月前发布  近地铁   \n",
       "47                中楼层(共8层) 3室1厅 | 65.78平米 | 西南    11人关注 / 2月前发布  近地铁   \n",
       "48  高楼层(共25层) | 2000年建 | 3室2厅 | 140.93平米 | 南 西    32人关注 / 8月前发布  近地铁   \n",
       "49    低楼层(共41层) | 2013年建 | 3室2厅 | 93.14平米 | 西北    79人关注 / 5月前发布  NaN   \n",
       "50       高楼层(共34层) | 2012年建 | 2室1厅 | 79平米 | 东南    57人关注 / 5月前发布  NaN   \n",
       "\n",
       "    单价(元/平米)  bedrooms  livingrooms  area_sqm  build_year  price_val  \n",
       "1    26959.4       NaN          NaN   26959.4         NaN    26959.4  \n",
       "2    22612.8       NaN          NaN   22612.8         NaN    22612.8  \n",
       "3    18014.5       NaN          NaN   18014.5         NaN    18014.5  \n",
       "4    13513.5       NaN          NaN   13513.5         NaN    13513.5  \n",
       "5    13220.3       NaN          NaN   13220.3         NaN    13220.3  \n",
       "6    43299.0       NaN          NaN   43299.0         NaN    43299.0  \n",
       "7    17000.0       NaN          NaN   17000.0         NaN    17000.0  \n",
       "8    17785.5       NaN          NaN   17785.5         NaN    17785.5  \n",
       "9    11199.3       NaN          NaN   11199.3         NaN    11199.3  \n",
       "10   12546.2       NaN          NaN   12546.2         NaN    12546.2  \n",
       "11   39690.4       NaN          NaN   39690.4         NaN    39690.4  \n",
       "12   27712.8       NaN          NaN   27712.8         NaN    27712.8  \n",
       "13   12559.8       NaN          NaN   12559.8         NaN    12559.8  \n",
       "14   18701.7       NaN          NaN   18701.7         NaN    18701.7  \n",
       "15   13636.9       NaN          NaN   13636.9         NaN    13636.9  \n",
       "16   15625.0       NaN          NaN   15625.0         NaN    15625.0  \n",
       "17   17020.0       NaN          NaN   17020.0         NaN    17020.0  \n",
       "18   17251.5       NaN          NaN   17251.5         NaN    17251.5  \n",
       "19   11192.7       NaN          NaN   11192.7         NaN    11192.7  \n",
       "20   17471.3       NaN          NaN   17471.3         NaN    17471.3  \n",
       "21   31179.1       NaN          NaN   31179.1         NaN    31179.1  \n",
       "22   34099.3       NaN          NaN   34099.3         NaN    34099.3  \n",
       "23   12383.3       NaN          NaN   12383.3         NaN    12383.3  \n",
       "24   25800.2       NaN          NaN   25800.2         NaN    25800.2  \n",
       "25   16982.2       NaN          NaN   16982.2         NaN    16982.2  \n",
       "26   19767.4       NaN          NaN   19767.4         NaN    19767.4  \n",
       "27   25581.7       NaN          NaN   25581.7         NaN    25581.7  \n",
       "28   20197.8       NaN          NaN   20197.8         NaN    20197.8  \n",
       "29   15447.2       NaN          NaN   15447.2         NaN    15447.2  \n",
       "30   13832.3       NaN          NaN   13832.3         NaN    13832.3  \n",
       "31   19890.9       NaN          NaN   19890.9         NaN    19890.9  \n",
       "32   31746.0       NaN          NaN   31746.0         NaN    31746.0  \n",
       "33   14970.1       NaN          NaN   14970.1         NaN    14970.1  \n",
       "34   15727.6       NaN          NaN   15727.6         NaN    15727.6  \n",
       "35   21120.1       NaN          NaN   21120.1         NaN    21120.1  \n",
       "36   16444.4       NaN          NaN   16444.4         NaN    16444.4  \n",
       "37   11096.3       NaN          NaN   11096.3         NaN    11096.3  \n",
       "38   32608.7       NaN          NaN   32608.7         NaN    32608.7  \n",
       "39   13959.9       NaN          NaN   13959.9         NaN    13959.9  \n",
       "40   17448.4       NaN          NaN   17448.4         NaN    17448.4  \n",
       "41   19895.8       NaN          NaN   19895.8         NaN    19895.8  \n",
       "42   24042.6       NaN          NaN   24042.6         NaN    24042.6  \n",
       "43   11552.7       NaN          NaN   11552.7         NaN    11552.7  \n",
       "44   19041.1       NaN          NaN   19041.1         NaN    19041.1  \n",
       "45   25239.8       NaN          NaN   25239.8         NaN    25239.8  \n",
       "46   19357.0       NaN          NaN   19357.0         NaN    19357.0  \n",
       "47   12921.9       NaN          NaN   12921.9         NaN    12921.9  \n",
       "48   12772.3       NaN          NaN   12772.3         NaN    12772.3  \n",
       "49   15031.1       NaN          NaN   15031.1         NaN    15031.1  \n",
       "50   15949.4       NaN          NaN   15949.4         NaN    15949.4  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "ename": "",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m在当前单元格或上一个单元格中执行代码时 Kernel 崩溃。\n",
      "\u001b[1;31m请查看单元格中的代码，以确定故障的可能原因。\n",
      "\u001b[1;31m单击<a href='https://aka.ms/vscodeJupyterKernelCrash'>此处</a>了解详细信息。\n",
      "\u001b[1;31m有关更多详细信息，请查看 Jupyter <a href='command:jupyter.viewOutput'>log</a>。"
     ]
    }
   ],
   "source": [
    "# 任务1.3：二手房数据处理实现（执行此单元）\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import re\n",
    "from pathlib import Path\n",
    "from IPython.display import display\n",
    "\n",
    "# 读取所有 sheet 并合并的函数\n",
    "def read_all_sheets(path):\n",
    "    path = Path(path)\n",
    "    if not path.exists():\n",
    "        raise FileNotFoundError(f\"{path} 未找到，请把文件放在工作目录或修改路径。当前路径：{Path.cwd()}\")\n",
    "    try:\n",
    "        sheets = pd.read_excel(path, sheet_name=None)\n",
    "    except Exception as e:\n",
    "        raise IOError(f\"读取 Excel 失败: {e}\")\n",
    "    df = pd.concat(sheets.values(), ignore_index=True, sort=False)\n",
    "    return df\n",
    "\n",
    "# 根据关键词尝试识别列名\n",
    "def find_column(df, keywords):\n",
    "    low = {c: c.lower() for c in df.columns}\n",
    "    for c,l in low.items():\n",
    "        for kw in keywords:\n",
    "            if kw in l:\n",
    "                return c\n",
    "    return None\n",
    "\n",
    "# 提取数字（含万/k/㎡/m2）\n",
    "def extract_number(s):\n",
    "    if pd.isna(s):\n",
    "        return np.nan\n",
    "    s = str(s).lower().replace('，',',').replace('／','/').replace('－','-').replace('—','-')\n",
    "    # 找含数字的部分\n",
    "    m = re.search(r'(\\d+[\\.,]?\\d*)', s)\n",
    "    if not m:\n",
    "        return np.nan\n",
    "    num = float(m.group(1).replace(',',''))\n",
    "    # 单位处理\n",
    "    if '万' in s and not ('元' in s and '万' not in s):\n",
    "        return num * 10000\n",
    "    if re.search(r'\\bk\\b', s):\n",
    "        return num * 1000\n",
    "    return num\n",
    "\n",
    "# 主体处理开始\n",
    "file = Path('handroom.xlsx')\n",
    "df = read_all_sheets(file)\n",
    "print('原始数据行数:', len(df), '列数:', df.shape[1])\n",
    "print('列名：', list(df.columns))\n",
    "\n",
    "# 识别小区列（社区/小区）\n",
    "community_col = find_column(df, ['小区','社区','小区名称','小区名','community','village'])\n",
    "if community_col is None:\n",
    "    # 回退到第一个字符串列\n",
    "    for c in df.columns:\n",
    "        if df[c].dtype == object or pd.api.types.is_string_dtype(df[c]):\n",
    "            community_col = c\n",
    "            break\n",
    "print('使用的小区列:', community_col)\n",
    "\n",
    "# 删除小区缺失行\n",
    "before = len(df)\n",
    "df[community_col] = df[community_col].astype(str).str.strip().replace({'nan':None,'none':None})\n",
    "df = df.dropna(subset=[community_col])\n",
    "print(f'删除小区缺失：{before - len(df)} 行，剩余 {len(df)} 行')\n",
    "\n",
    "# 识别价格列（单价或总价）\n",
    "price_col = find_column(df, ['单价','总价','价格','元','price','成交价','售价','价'])\n",
    "if price_col is None:\n",
    "    # 尝试数值列作为价格\n",
    "    for c in df.columns:\n",
    "        if pd.api.types.is_numeric_dtype(df[c]):\n",
    "            price_col = c\n",
    "            break\n",
    "print('使用的价格列:', price_col)\n",
    "\n",
    "# 识别面积列\n",
    "area_col = find_column(df, ['面积','㎡','平米','m2','m²','area'])\n",
    "print('识别的面积列:', area_col)\n",
    "\n",
    "# 识别建造年份或房龄列\n",
    "year_col = find_column(df, ['建造','建成','年份','竣工','year','建成年份','房龄','age'])\n",
    "print('识别的建造年份/房龄列:', year_col)\n",
    "\n",
    "# 识别楼层/楼层类型列\n",
    "floor_col = find_column(df, ['楼层','楼层类型','层次','floor'])\n",
    "print('识别的楼层列:', floor_col)\n",
    "\n",
    "# 识别户型/房型列\n",
    "house_type_col = find_column(df, ['户型','房型','室','room','house_type'])\n",
    "print('识别的户型列:', house_type_col)\n",
    "\n",
    "# 尝试拆分户型（如 2室1厅 -> bedrooms=2, living=1）\n",
    "if house_type_col is not None:\n",
    "    def parse_house_type(s):\n",
    "        if pd.isna(s):\n",
    "            return (np.nan, np.nan)\n",
    "        s = str(s)\n",
    "        m = re.search(r'(\\d+)\\s*室', s)\n",
    "        a = int(m.group(1)) if m else np.nan\n",
    "        m2 = re.search(r'(\\d+)\\s*厅', s)\n",
    "        b = int(m2.group(1)) if m2 else np.nan\n",
    "        return (a,b)\n",
    "    parsed = df[house_type_col].apply(parse_house_type)\n",
    "    df['bedrooms'] = parsed.apply(lambda x: x[0])\n",
    "    df['livingrooms'] = parsed.apply(lambda x: x[1])\n",
    "else:\n",
    "    df['bedrooms'] = np.nan\n",
    "    df['livingrooms'] = np.nan\n",
    "\n",
    "# 解析面积为数值\n",
    "if area_col is not None:\n",
    "    df['area_sqm'] = df[area_col].apply(extract_number)\n",
    "else:\n",
    "    df['area_sqm'] = np.nan\n",
    "\n",
    "# 处理建造年份/房龄：如果是房龄（年）则转为建造年份（当前年 - age）\n",
    "from datetime import datetime\n",
    "cur_year = datetime.now().year\n",
    "df['build_year'] = np.nan\n",
    "if year_col is not None:\n",
    "    def parse_year(x):\n",
    "        if pd.isna(x):\n",
    "            return np.nan\n",
    "        s = str(x)\n",
    "        # 若为纯数字 4 位，认为是年份\n",
    "        m = re.search(r'(19\\d{2}|20\\d{2})', s)\n",
    "        if m:\n",
    "            return int(m.group(1))\n",
    "        # 若为年龄（如 5年），提取数字并减去\n",
    "        m2 = re.search(r'(\\d+)\\s*年', s)\n",
    "        if m2:\n",
    "            age = int(m2.group(1))\n",
    "            return cur_year - age\n",
    "        # 若为小数字，认为是房龄\n",
    "        m3 = re.search(r'^(\\d{1,2})$', s.strip())\n",
    "        if m3:\n",
    "            age = int(m3.group(1))\n",
    "            return cur_year - age\n",
    "        return np.nan\n",
    "    df['build_year'] = df[year_col].apply(parse_year)\n",
    "else:\n",
    "    df['build_year'] = np.nan\n",
    "\n",
    "# 将价格解析为数值（如果包含'万'则乘以10000）\n",
    "def parse_price(x):\n",
    "    if pd.isna(x):\n",
    "        return np.nan\n",
    "    s = str(x).lower()\n",
    "    # 去掉逗号等\n",
    "    s = s.replace(',','').replace('，','')\n",
    "    m = re.search(r'(\\d+[\\.,]?\\d*)', s)\n",
    "    if not m:\n",
    "        return np.nan\n",
    "    val = float(m.group(1))\n",
    "    if '万' in s and '元' not in s:\n",
    "        val = val * 10000\n",
    "    if re.search(r'\\bk\\b', s):\n",
    "        val = val * 1000\n",
    "    return val\n",
    "\n",
    "df['price_val'] = df[price_col].apply(parse_price) if price_col is not None else np.nan\n",
    "\n",
    "# 重复值处理：删除完全重复的行并重置索引\n",
    "before_dup = len(df)\n",
    "df = df.drop_duplicates().reset_index(drop=True)\n",
    "print(f'删除重复行数: {before_dup - len(df)}')\n",
    "\n",
    "# 异常值处理：针对\"翡翠城四期\"按 price_val 使用 IQR 去异常\n",
    "target_community = '翡翠城四期'\n",
    "if community_col is not None and price_col is not None:\n",
    "    mask = df[community_col].astype(str).str.contains(target_community)\n",
    "    subset = df[mask & df['price_val'].notna()]\n",
    "    if len(subset) > 0:\n",
    "        q1 = subset['price_val'].quantile(0.25)\n",
    "        q3 = subset['price_val'].quantile(0.75)\n",
    "        iqr = q3 - q1\n",
    "        low = q1 - 1.5 * iqr\n",
    "        high = q3 + 1.5 * iqr\n",
    "        before_out = len(df[mask])\n",
    "        df = df[~(mask & ((df['price_val'] < low) | (df['price_val'] > high)))]\n",
    "        after_out = len(df[df[community_col].astype(str).str.contains(target_community)])\n",
    "        print(f'在\"{target_community}\"中，移除异常前 {before_out} 行，移除后 {after_out} 行')\n",
    "    else:\n",
    "        print(f'在\"{target_community}\"中未找到可用价格数据用于异常检测（样本量 {len(subset)})')\n",
    "else:\n",
    "    print('无法对异常值进行检测：未识别到小区列或价格列')\n",
    "\n",
    "# 保存清洗结果\n",
    "out = Path('handroom_cleaned.csv')\n",
    "try:\n",
    "    df.to_csv(out, index=False, encoding='utf-8-sig')\n",
    "    print(f'已保存清洗后数据到: {out}')\n",
    "except Exception as e:\n",
    "    print('保存失败:', e)\n",
    "\n",
    "# 统计分析：每个建造年份的房屋数、不同楼层类型的数量、每个小区的房屋平均价\n",
    "print('\\n统计：每个建造年份的房屋数（按 build_year）')\n",
    "by_year = df['build_year'].dropna().astype(int).value_counts().sort_index()\n",
    "display(by_year)\n",
    "\n",
    "print('\\n统计：不同楼层类型的数量（按原楼层字段显示）')\n",
    "if floor_col is not None:\n",
    "    display(df[floor_col].value_counts())\n",
    "else:\n",
    "    print('未识别到楼层列')\n",
    "\n",
    "print('\\n统计：每个小区的房屋平均价（price_val）——按小区分组，取均值，显示前50')\n",
    "if price_col is not None:\n",
    "    avg_price = df.groupby(community_col)['price_val'].mean().sort_values(ascending=False)\n",
    "    display(avg_price.head(50))\n",
    "else:\n",
    "    print('未识别到价格列，无法计算平均价')\n",
    "\n",
    "print('\\n示例数据（前50行）:')\n",
    "display(df.head(50))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "py39",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.23"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
